from langchain_community.document_loaders import PyPDFLoader

# Load a PDF: load() returns a list of Document objects (one per page here).
loader = PyPDFLoader("data/test.pdf")
pages = loader.load()
# NOTE: load() takes no page-selection argument, so calling it with a list of
# page numbers raises TypeError. To work with only part of the document, slice
# the loaded list instead, e.g. pages[:4] for the pages at indices 0-3.
print(f"总页数:{len(pages)}")


# Inspect the first loaded page: its text and its metadata.
first_page = pages[0]
page_content = first_page.page_content
metadata = first_page.metadata
preview = page_content[:200]  # only the first 200 characters are shown
print(f"第一页内容:\n{preview}...")
print(f"元数据:{metadata}")

# Stitch the text of every page together, separated by a blank line,
# and report the length of the combined text.
full_text = "\n\n".join(page.page_content for page in pages)
total_chars = len(full_text)
print(f"全文长度:\n{total_chars}字符")

# Load a PDF with image extraction enabled and print the first page's text.
# NOTE(review): extract_images=True presumably needs an extra OCR dependency
# to be installed -- confirm against the loader's documentation.
loader = PyPDFLoader("data/pdf-img.pdf", extract_images=True)
pages = loader.load()
first_page_text = pages[0].page_content
print(first_page_text)

# Load every PDF found directly inside pdf_folder and collect all of their
# pages into a single list.
import os
pdf_folder = "docs/"
all_pages = []
# os.listdir() returns names in arbitrary order; sort for a reproducible result.
for file_name in sorted(os.listdir(pdf_folder)):
    if file_name.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(pdf_folder, file_name))
        # Actually load this file. Extending with the module-level `pages`
        # would re-append the previously loaded document once per file.
        all_pages.extend(loader.load())